Download data

load('./data/df_combine.Rdata')

Incidence

df_exercise = df_combine %>% 
  mutate(exercise = as.numeric(exercise),
         total_exercise = exercise * exercise_time,
         gender = as.factor(gender))
df_incidence  <- df_exercise %>% 
  mutate(tb = fct_recode(tb, '1'= 'Yes', '0'='No')) %>% 
  mutate(tb=as.character(tb),
         tb=as.numeric(tb)) %>% 
  group_by(total_exercise) %>% 
  summarise(tb_sum = sum(tb),
           incidence = tb_sum/n()) %>% 
  ungroup()

K-M plot for different exercise people

df_exercise = df_exercise %>% 
  inner_join(df_incidence, by = "total_exercise")

df_exercise$survival = with(df_exercise, Surv(days, tb == "Yes"))

km <- survfit(survival ~ 1, data = df_exercise, conf.type = "log-log")
km_by_exercise <- survfit(survival ~ exercise, data = df_exercise, conf.type = "log-log")

km_by_exercise_time <- survfit(survival ~ exercise_time, data = df_exercise, conf.type = "log-log")

plot_exercise_level <- GGally::ggsurv(km_by_exercise, main = "Kaplan-Meier Curve for getting TB of different exercise level")
plotly::ggplotly(plot_exercise_level)
plot_exercise_time <- GGally::ggsurv(km_by_exercise_time, main = "Kaplan-Meier Curve for getting TB of different exercise time")
plotly::ggplotly(plot_exercise_time)

The plot shows that people with lower exercise level have higher probability of getting TB, which show that exercise is important to preventing TB. However, people with over 2 hours exercise time have higher probability of getting TB. That may be because that people spending more time on exercise have more probability touching other people and getting infection.

Show exercise difference between gender

plot_exer<-ggplot(df_exercise, aes(x = dmage, y =total_exercise, colour=dmage)) +
    geom_histogram(stat = "identity", width = .6) +
    labs(title="The average exercise vs age",
         x = "age") +
    theme(axis.title.x =  element_blank(),
          axis.text.x  =  element_blank(), 
          axis.title.y = element_text(face="bold", size=12),
          axis.text.y  = element_text(angle=0, vjust=0.5, size=10),
          legend.title = element_text(size=12, face="bold"),
          legend.text = element_text(size = 12, face = "bold"))+
    facet_wrap(~gender)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(plot_exer)

Average exercise of female is slightly higher than male.

map

x = df_exercise %>% 
  group_by(district) %>% 
  summarise(exercise_sum = sum(total_exercise)) %>% 
  ungroup()

levels(x$district) <- list("Huangpu" = 310101, "Xuhui" = 310104, "Changning" = c(310105, 310106), "Putuo" = 310107, "Zhabei" = 310108, "Hongkou" = 310109, "Yangpu" = 310110, "Minhang" = 310112, "Baoshan" = 310113,  "Pudong" = c(310115, 310119), "Jiading" = 310114, "Jinshan" = 310116, "Songjiang" = 310117, "Qingpu" = 310118, "Fengxian" = 310120, "Chongming" = 310230)
sh <- readOGR('./data/shanghai_shapefile/shang_dis_merged.shp',verbose = F)
## Warning in readOGR("./data/shanghai_shapefile/shang_dis_merged.shp",
## verbose = F): Z-dimension discarded
##Translating and Adding two missing two districts
sh@data <- sh@data %>% 
  mutate(Name = as.factor(Name)) %>%
  mutate(Name = fct_recode(Name, Jiading = '嘉定区',
                           Fengxian = '奉贤区',
                           Baoshan = '宝山区',
                           Chongming = '崇明县',
                           Xuhui = '徐汇区',
                           Putuo ='普陀区',
                           Yangpu = '杨浦区',
                           Songjiang = '松江区',
                           Pudong='浦东新区',
                           Hongkou = '虹口区',
                           Jinshan = '金山区',
                           Changning = '长宁区',
                           Minhang = '闵行区',
                           Zhabei = '闸北区',
                           Qingpu = '青浦区', 
                           Huangpu = '黄浦区'))
## Warning: Unknown levels in `f`: <U+5609><U+5B9A><U+533A>, <U+5949><U
## +8D24><U+533A>, <U+5B9D><U+5C71><U+533A>, <U+5D07><U+660E><U+53BF>, <U
## +5F90><U+6C47><U+533A>, <U+666E><U+9640><U+533A>, <U+6768><U+6D66><U+533A>,
## <U+677E><U+6C5F><U+533A>, <U+6D66><U+4E1C><U+65B0><U+533A>, <U+8679><U
## +53E3><U+533A>, <U+91D1><U+5C71><U+533A>, <U+957F><U+5B81><U+533A>, <U
## +95F5><U+884C><U+533A>, <U+95F8><U+5317><U+533A>, <U+9752><U+6D66><U+533A>,
## <U+9EC4><U+6D66><U+533A>
sh$exercise_sum <- x$exercise_sum

qtm(sh, fill = "exercise_sum")
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3

People from Minxing district has the highest exercise.